{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<center>\n",
    "<img src=\"../../img/ods_stickers.jpg\">\n",
    "## Открытый курс по машинному обучению. Сессия № 2\n",
    "Автор материала: программист-исследователь Mail.ru Group, старший преподаватель Факультета Компьютерных Наук ВШЭ Юрий Кашницкий. Материал распространяется на условиях лицензии [Creative Commons CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Можно использовать в любых целях (редактировать, поправлять и брать за основу), кроме коммерческих, но с обязательным упоминанием автора материала."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# <center>Тема 10. Бустинг\n",
    "## <center> Часть 10. Продвинутые методы работы с категориальными признаками и CatBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "pd.set_option(\"display.max.columns\", 100)\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import roc_auc_score\n",
    "from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Считаем данные и посмотрим на первые несколько строк. Видим, что у нас тут немало категориальных признаков."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(\"../../data/bank.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Всего 9 признаков со строковыми значениями."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.columns[df.dtypes == \"object\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Без категориальных признаков\n",
    "Попытаемся сначала просто проигнорировать категориальные признаки. Обучим случайный лес и посмотрим на ROC AUC на кросс-валидации и на отоженной выборке. Это будет наш бейзлайн. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_no_cat, y = df.loc[:, df.dtypes != \"object\"].drop(\"y\", axis=1), df[\"y\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_no_cat_part, df_no_cat_valid, y_train_part, y_valid = train_test_split(\n",
    "    df_no_cat, y, test_size=0.3, stratify=y, random_state=17\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest = RandomForestClassifier(random_state=17)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(forest, df_no_cat_part, y_train_part, cv=skf, scoring=\"roc_auc\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest.fit(df_no_cat_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, forest.predict_proba(df_no_cat_valid)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## LabelEncoder для категориальных признаков\n",
    "Сделаем то же самое, но попробуем закодировать категориальные признаки по-простому: с помощью `LabelEncoder`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_encoder = LabelEncoder()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cat_label_enc = df.copy().drop(\"y\", axis=1)\n",
    "for col in df.columns[df.dtypes == \"object\"]:\n",
    "    df_cat_label_enc[col] = label_encoder.fit_transform(df_cat_label_enc[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cat_label_enc.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cat_label_enc_part, df_cat_label_enc_valid = train_test_split(\n",
    "    df_cat_label_enc, test_size=0.3, stratify=y, random_state=17\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(\n",
    "        forest, df_cat_label_enc_part, y_train_part, cv=skf, scoring=\"roc_auc\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest.fit(df_cat_label_enc_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, forest.predict_proba(df_cat_label_enc_valid)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Бинаризация категориальных признаков (dummies, OHE)\n",
    "Теперь сделаем то, что обычно по умолчанию и делают – бинаризацию категориальных признаков. Dummy-признаки, One-Hot Encoding... с небольшими различиями это об одном же - для каждого значения каждого категориального признака завести свой бинарный признак."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cat_dummies = pd.get_dummies(df, columns=df.columns[df.dtypes == \"object\"]).drop(\n",
    "    \"y\", axis=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cat_dummies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_cat_dummies_part, df_cat_dummies_valid = train_test_split(\n",
    "    df_cat_dummies, test_size=0.3, stratify=y, random_state=17\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(\n",
    "        forest, df_cat_dummies_part, y_train_part, cv=skf, scoring=\"roc_auc\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest.fit(df_cat_dummies_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, forest.predict_proba(df_cat_dummies_valid)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Попарные взаимодействия признаков\n",
    "Пока лес все еще лучше регрессии (хотя мы не тюнили гиперпараметры, но и не будем). Мы хотим идти дальше. Мощной техникой для работы с категориальными признаками будет учет попарных взаимодействий признаков (feature interactions). Построим попарные взаимодействия всех признаков. Вообще тут можно пойти дальше и строить взаимодействия трех и более признаков. Owen Zhang [как-то строил](https://www.youtube.com/watch?v=LgLcfZjNF44) даже 7-way interactions. Чего не сделаешь ради победы на Kaggle! :)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_interact = df.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_features = df.columns[df.dtypes == \"object\"]\n",
    "for i, col1 in enumerate(cat_features):\n",
    "    for j, col2 in enumerate(cat_features[i + 1 :]):\n",
    "        df_interact[col1 + \"_\" + col2] = df_interact[col1] + \"_\" + df_interact[col2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_interact.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_interact.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Бинаризация категориальных признаков (dummies, OHE) + попарные взаимодействия\n",
    "Получилось аж 824 бинарных признака – многовато для такой задачи, и тут случайный лес начинает не справляться, да и логистическая регрессия сработала хуже, чем в прошлый раз."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_interact_cat_dummies = pd.get_dummies(\n",
    "    df_interact, columns=df_interact.columns[df_interact.dtypes == \"object\"]\n",
    ").drop(\"y\", axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_interact_cat_dummies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_interact_cat_dummies_part, df_interact_cat_dummies_valid = train_test_split(\n",
    "    df_interact_cat_dummies, test_size=0.3, stratify=y, random_state=17\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(\n",
    "        forest, df_interact_cat_dummies_part, y_train_part, cv=skf, scoring=\"roc_auc\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest.fit(df_interact_cat_dummies_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, forest.predict_proba(df_interact_cat_dummies_valid)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Случайному лесу уже тяжеловато, когда признаков так много, а вот логистической регрессии – норм."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "logit = LogisticRegression(random_state=17)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(\n",
    "        logit, df_interact_cat_dummies_part, y_train_part, cv=skf, scoring=\"roc_auc\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logit.fit(df_interact_cat_dummies_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, logit.predict_proba(df_interact_cat_dummies_valid)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mean Target\n",
    "Теперь будем использовать технику кодирования категориальных признаков средним значением целевого признака. Это очень мощная техника, правда, надо умело ее использовать – легко переобучиться. \n",
    "Основная идея – для каждого значения категориального признака посчитать среднее значение целевого признака и заменить категориальный признак на посчитанные средние. Правда, считать средние надо на кросс-валидации, а то легко переобучиться. \n",
    "Но далее я адресую к видео топ-участников соревнований Kaggle, от них можно узнать про эту технику из первых уст. \n",
    "- [Специализация](https://www.coursera.org/specializations/aml) \"Advanced Machine Learning\" на Coursera, [курс](https://www.coursera.org/learn/competitive-data-science)\", How to Win a Data Science Competition: Learn from Top Kagglers\", несколько видео посвящено различным способам построяния признаков с задействованием целевого, и как при этом не переобучиться. Рассказывает Дмитрий Алтухов\n",
    "- [Лекция](https://www.youtube.com/watch?v=g335THJxkto) с презентацией решения конкурса Kaggle BNP paribas, Станислав Семенов\n",
    "\n",
    "Похожая техника [используется](https://tech.yandex.com/catboost/doc/dg/concepts/algorithm-main-stages_cat-to-numberic-docpage/) и в CatBoost.\n",
    "\n",
    "Для начала давайте таким образом закодируем исходные категориальные признаки."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df, y = df.copy(), df[\"y\"]\n",
    "train_df_part, valid_df, y_train_part, y_valid = train_test_split(\n",
    "    train_df.drop(\"y\", axis=1), y, test_size=0.3, stratify=y, random_state=17\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def mean_target_enc(train_df, y_train, valid_df, skf):\n",
    "    import warnings\n",
    "\n",
    "    warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "    glob_mean = y_train.mean()\n",
    "    train_df = pd.concat([train_df, pd.Series(y_train, name=\"y\")], axis=1)\n",
    "    new_train_df = train_df.copy()\n",
    "\n",
    "    cat_features = train_df.columns[train_df.dtypes == \"object\"].tolist()\n",
    "\n",
    "    for col in cat_features:\n",
    "        new_train_df[col + \"_mean_target\"] = [\n",
    "            glob_mean for _ in range(new_train_df.shape[0])\n",
    "        ]\n",
    "\n",
    "    for train_idx, valid_idx in skf.split(train_df, y_train):\n",
    "        train_df_cv, valid_df_cv = (\n",
    "            train_df.iloc[train_idx, :],\n",
    "            train_df.iloc[valid_idx, :],\n",
    "        )\n",
    "\n",
    "        for col in cat_features:\n",
    "\n",
    "            means = valid_df_cv[col].map(train_df_cv.groupby(col)[\"y\"].mean())\n",
    "            valid_df_cv[col + \"_mean_target\"] = means.fillna(glob_mean)\n",
    "\n",
    "        new_train_df.iloc[valid_idx] = valid_df_cv\n",
    "\n",
    "    new_train_df.drop(cat_features + [\"y\"], axis=1, inplace=True)\n",
    "\n",
    "    for col in cat_features:\n",
    "        means = valid_df[col].map(train_df.groupby(col)[\"y\"].mean())\n",
    "        valid_df[col + \"_mean_target\"] = means.fillna(glob_mean)\n",
    "\n",
    "    valid_df.drop(train_df.columns[train_df.dtypes == \"object\"], axis=1, inplace=True)\n",
    "\n",
    "    return new_train_df, valid_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_mean_target_part, valid_mean_target = mean_target_enc(\n",
    "    train_df_part, y_train_part, valid_df, skf\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(\n",
    "        forest, train_mean_target_part, y_train_part, cv=skf, scoring=\"roc_auc\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest.fit(train_mean_target_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, forest.predict_proba(valid_mean_target)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mean Target + попарные взаимодействия"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df, y = df_interact.drop(\"y\", axis=1).copy(), df_interact[\"y\"]\n",
    "train_df_part, valid_df, y_train_part, y_valid = train_test_split(\n",
    "    train_df, y, test_size=0.3, stratify=y, random_state=17\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_mean_target_part, valid_mean_target = mean_target_enc(\n",
    "    train_df_part, y_train_part, valid_df, skf\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(\n",
    "        forest, train_mean_target_part, y_train_part, cv=skf, scoring=\"roc_auc\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest.fit(train_mean_target_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, forest.predict_proba(valid_mean_target)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Опять лучше справляется логистическая регрессия."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(\n",
    "    cross_val_score(\n",
    "        logit, train_mean_target_part, y_train_part, cv=skf, scoring=\"roc_auc\"\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "logit.fit(train_mean_target_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, logit.predict_proba(valid_mean_target)[:, 1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Catboost\n",
    "В библиотеке [Catboost](https://catboost.yandex), помимо всего прочего, реализована как раз техника кодирования категориальных значений средним значением целевого признака. Результаты получаются хорошими именно когда в данных много важных категориальных признаков. Из минусов можно отметить меньшую (пока что) производительность в сравнении с Xgboost и LightGBM."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from catboost import CatBoostClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ctb = CatBoostClassifier(random_seed=17)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df, y = df.drop(\"y\", axis=1), df[\"y\"]\n",
    "train_df_part, valid_df, y_train_part, y_valid = train_test_split(\n",
    "    train_df, y, test_size=0.3, stratify=y, random_state=17\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_features_idx = np.where(train_df_part.dtypes == \"object\")[0].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "cv_scores = []\n",
    "for train_idx, test_idx in skf.split(train_df_part, y_train_part):\n",
    "    cv_train_df, cv_valid_df = (\n",
    "        train_df_part.iloc[train_idx, :],\n",
    "        train_df_part.iloc[test_idx, :],\n",
    "    )\n",
    "    y_cv_train, y_cv_valid = y_train_part.iloc[train_idx], y_train_part.iloc[test_idx]\n",
    "\n",
    "    ctb.fit(cv_train_df, y_cv_train, cat_features=cat_features_idx)\n",
    "\n",
    "    cv_scores.append(roc_auc_score(y_cv_valid, ctb.predict_proba(cv_valid_df)[:, 1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.mean(cv_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "ctb.fit(train_df_part, y_train_part, cat_features=cat_features_idx);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "roc_auc_score(y_valid, ctb.predict_proba(valid_df)[:, 1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
